{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "outputs": [],
   "source": [
    "# Make the parent directory importable; `config` is presumably located there.\n",
    "import os\n",
    "import sys\n",
    "\n",
    "sys.path.insert(0, os.path.abspath(os.pardir))\n",
    "\n",
    "# Set the environment variables through the project's helper.\n",
    "from config import set_environment\n",
    "\n",
    "set_environment()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-07T17:47:50.699369Z",
     "start_time": "2024-08-07T17:47:50.692492Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import WebBaseLoader\n",
    "\n",
    "# Load the Amazon product-reviews page into LangChain documents.\n",
    "loader = WebBaseLoader(\n",
    "    web_path=\"https://www.amazon.com/product-reviews/B0CBBL55PQ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews\"\n",
    ")\n",
    "docs = loader.load()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T12:16:45.276265Z",
     "start_time": "2024-04-09T12:16:44.273716Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Created a chunk of size 8673, which is longer than the specified 500\n",
      "Created a chunk of size 2119, which is longer than the specified 500\n",
      "Created a chunk of size 3894, which is longer than the specified 500\n",
      "Created a chunk of size 1458, which is longer than the specified 500\n",
      "Created a chunk of size 580, which is longer than the specified 500\n",
      "Created a chunk of size 1475, which is longer than the specified 500\n",
      "Created a chunk of size 1880, which is longer than the specified 500\n",
      "Created a chunk of size 3152, which is longer than the specified 500\n"
     ]
    }
   ],
   "source": [
    "from langchain_text_splitters import CharacterTextSplitter\n",
    "\n",
    "# Cut the loaded page on the \"\\nReport\" marker. As the warnings in this cell's\n",
    "# output show, a piece between two markers can end up longer than chunk_size.\n",
    "splitter = CharacterTextSplitter(separator=\"\\nReport\", chunk_size=500, chunk_overlap=0)\n",
    "split_docs = splitter.split_documents(documents=docs)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T12:30:43.828459Z",
     "start_time": "2024-04-09T12:30:43.777132Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8636\n",
      "2115\n",
      "3890\n",
      "1454\n",
      "370\n",
      "576\n",
      "443\n",
      "1471\n",
      "1876\n",
      "3148\n",
      "2685\n"
     ]
    }
   ],
   "source": [
    "# Show how many characters each chunk contains, one number per line.\n",
    "for chunk in split_docs:\n",
    "    chunk_length = len(chunk.page_content)\n",
    "    print(chunk_length)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T12:30:50.012512Z",
     "start_time": "2024-04-09T12:30:49.996067Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "outputs": [],
   "source": [
    "from langchain_community.vectorstores.chroma import Chroma\n",
    "from langchain_cohere import CohereEmbeddings\n",
    "\n",
    "# Name the embedding model explicitly: recent langchain-cohere releases no longer\n",
    "# supply a default and raise if `model` is missing. \"embed-english-v2.0\" is the\n",
    "# default that older releases used, so the embeddings themselves do not change.\n",
    "embeddings = CohereEmbeddings(model=\"embed-english-v2.0\")\n",
    "\n",
    "# Embed every chunk and index it in a Chroma vector store.\n",
    "vectorstore = Chroma.from_documents(documents=split_docs, embedding=embeddings)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T12:31:13.851321Z",
     "start_time": "2024-04-09T12:31:12.579456Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "outputs": [],
   "source": [
    "# Ask the vector store for the single chunk closest to the query sentence.\n",
    "similar_vectors = vectorstore.similarity_search(\n",
    "    query=\"This is a fantastic book!\",\n",
    "    k=1,\n",
    ")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T12:31:17.465440Z",
     "start_time": "2024-04-09T12:31:17.047704Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Showing 0 comments\n",
      "There was a problem loading comments right now. Please try again later.MZ5.0 out of 5 stars\n",
      "Great Book!\n",
      "Reviewed in the United States on January 1, 2024Verified Purchase\n",
      "This book has substance and interesting material, best book so far in the market covering langchain. The author knows his stuff!\n",
      "\n",
      "2 people found this helpful\n",
      "\n",
      "\n",
      "              Helpful\n"
     ]
    }
   ],
   "source": [
    "# Print the text of the first (best-matching) search result.\n",
    "top_match = similar_vectors[0]\n",
    "print(top_match.page_content)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T12:37:16.480514Z",
     "start_time": "2024-04-09T12:37:16.469379Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders.wikipedia import WikipediaLoader\n",
    "\n",
    "# Load the Wikipedia results for the \"LangChain\" query as documents.\n",
    "loader = WikipediaLoader(query=\"LangChain\")\n",
    "documents = loader.load()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T12:42:30.345785Z",
     "start_time": "2024-04-09T12:41:59.882507Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "outputs": [
    {
     "data": {
      "text/plain": "'LangChain is a framework designed to simplify the creation of applications using large language models'"
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first 102 characters of the first loaded document.\n",
    "first_document = documents[0]\n",
    "first_document.page_content[:102]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T12:45:07.397353Z",
     "start_time": "2024-04-09T12:45:07.376697Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "from langchain_community.retrievers import KNNRetriever\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "\n",
    "# Build a small k-nearest-neighbour retriever over four sample words.\n",
    "words = [\"cat\", \"dog\", \"computer\", \"animal\"]\n",
    "retriever = KNNRetriever.from_texts(texts=words, embeddings=OpenAIEmbeddings())\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-07T17:48:32.266706Z",
     "start_time": "2024-08-07T17:48:25.145084Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "outputs": [
    {
     "data": {
      "text/plain": "[Document(page_content='dog'),\n Document(page_content='animal'),\n Document(page_content='cat'),\n Document(page_content='computer')]"
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look up the stored words nearest to the query.\n",
    "# `invoke` is the supported retriever entry point; `get_relevant_documents` is deprecated.\n",
    "retriever.invoke(\"dog\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-09T12:52:25.909999Z",
     "start_time": "2024-04-09T12:52:25.603103Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Healthcare workers' experiences protecting themselves and their families during the COVID-19 pandemic in 2020-2021.\n",
      "Low-Dose Direct Oral Anticoagulation vs Dual Antiplatelet Therapy After Left Atrial Appendage Occlusion: The ADALA Randomized Clinical Trial.\n",
      "Improving Transparency of Decision Models Through the Application of Decision Analytic Models with Omitted Objects Displayed (DAMWOOD).\n"
     ]
    }
   ],
   "source": [
    "from langchain_community.retrievers.pubmed import PubMedRetriever\n",
    "\n",
    "# Dedicated names leave the `retriever` and `documents` variables bound by earlier\n",
    "# cells untouched, so those cells still behave the same when re-run.\n",
    "pubmed_retriever = PubMedRetriever()\n",
    "\n",
    "# `invoke` replaces the deprecated `get_relevant_documents`.\n",
    "pubmed_documents = pubmed_retriever.invoke(\"COVID\")\n",
    "\n",
    "# Print the title recorded in each returned document's metadata.\n",
    "for pubmed_document in pubmed_documents:\n",
    "    print(pubmed_document.metadata[\"Title\"])\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-07T17:50:42.070767Z",
     "start_time": "2024-08-07T17:50:39.123072Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
